데이터 : 2020 epl game data
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import warnings
# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Load the match records from the local CSV export.
data = pd.read_csv(
    'C:/Users/hyoseok/Desktop/data_science practice/epl_data_analysis/epl2020.csv'
)
data.head()
| Unnamed: 0 | h_a | xG | xGA | npxG | npxGA | deep | deep_allowed | scored | missed | ... | AF.x | AC.x | AY.x | AR.x | B365H.x | B365D.x | B365A.x | HtrgPerc | AtrgPerc | matchDay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | h | 2.234560 | 0.842407 | 2.234560 | 0.842407 | 11 | 5 | 4 | 1 | ... | 9 | 2 | 2 | 0 | 1.14 | 10.0 | 19.00 | 0.466667 | 0.416667 | Fri |
| 1 | 2 | a | 0.842407 | 2.234560 | 0.842407 | 2.234560 | 5 | 11 | 1 | 4 | ... | 9 | 2 | 2 | 0 | 1.14 | 10.0 | 19.00 | 0.466667 | 0.416667 | Fri |
| 2 | 3 | a | 3.183770 | 1.200300 | 2.422640 | 1.200300 | 9 | 1 | 5 | 0 | ... | 13 | 1 | 2 | 0 | 12.00 | 6.5 | 1.22 | 0.600000 | 0.642857 | Sat |
| 3 | 4 | h | 1.200300 | 3.183770 | 1.200300 | 2.422640 | 1 | 9 | 0 | 5 | ... | 13 | 1 | 2 | 0 | 12.00 | 6.5 | 1.22 | 0.600000 | 0.642857 | Sat |
| 4 | 5 | h | 1.340990 | 1.598640 | 1.340990 | 1.598640 | 4 | 6 | 1 | 1 | ... | 19 | 4 | 1 | 0 | 1.95 | 3.6 | 3.60 | 0.230769 | 0.375000 | Sat |
5 rows × 45 columns
# Print each column's non-null count and dtype to check for missing values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 576 entries, 0 to 575 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 576 non-null int64 1 h_a 576 non-null object 2 xG 576 non-null float64 3 xGA 576 non-null float64 4 npxG 576 non-null float64 5 npxGA 576 non-null float64 6 deep 576 non-null int64 7 deep_allowed 576 non-null int64 8 scored 576 non-null int64 9 missed 576 non-null int64 10 xpts 576 non-null float64 11 result 576 non-null object 12 date 576 non-null object 13 wins 576 non-null int64 14 draws 576 non-null int64 15 loses 576 non-null int64 16 pts 576 non-null int64 17 npxGD 576 non-null float64 18 teamId 576 non-null object 19 ppda_cal 576 non-null float64 20 allowed_ppda 576 non-null float64 21 matchtime 576 non-null int64 22 tot_points 576 non-null int64 23 round 576 non-null int64 24 tot_goal 576 non-null int64 25 tot_con 576 non-null int64 26 Referee.x 576 non-null object 27 HS.x 576 non-null int64 28 HST.x 576 non-null int64 29 HF.x 576 non-null int64 30 HC.x 576 non-null int64 31 HY.x 576 non-null int64 32 HR.x 576 non-null int64 33 AS.x 576 non-null int64 34 AST.x 576 non-null int64 35 AF.x 576 non-null int64 36 AC.x 576 non-null int64 37 AY.x 576 non-null int64 38 AR.x 576 non-null int64 39 B365H.x 576 non-null float64 40 B365D.x 576 non-null float64 41 B365A.x 576 non-null float64 42 HtrgPerc 576 non-null float64 43 AtrgPerc 576 non-null float64 44 matchDay 576 non-null object dtypes: float64(13), int64(26), object(6) memory usage: 202.6+ KB
# Keep only the rows flagged as home matches (h_a == 'h').
home = data.loc[data['h_a'].eq('h')]
home.head()
| Unnamed: 0 | h_a | xG | xGA | npxG | npxGA | deep | deep_allowed | scored | missed | ... | AF.x | AC.x | AY.x | AR.x | B365H.x | B365D.x | B365A.x | HtrgPerc | AtrgPerc | matchDay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | h | 2.234560 | 0.842407 | 2.234560 | 0.842407 | 11 | 5 | 4 | 1 | ... | 9 | 2 | 2 | 0 | 1.14 | 10.00 | 19.00 | 0.466667 | 0.416667 | Fri |
| 3 | 4 | h | 1.200300 | 3.183770 | 1.200300 | 2.422640 | 1 | 9 | 0 | 5 | ... | 13 | 1 | 2 | 0 | 12.00 | 6.50 | 1.22 | 0.600000 | 0.642857 | Sat |
| 4 | 5 | h | 1.340990 | 1.598640 | 1.340990 | 1.598640 | 4 | 6 | 1 | 1 | ... | 19 | 4 | 1 | 0 | 1.95 | 3.60 | 3.60 | 0.230769 | 0.375000 | Sat |
| 6 | 7 | h | 0.909241 | 1.087520 | 0.909241 | 1.087520 | 0 | 9 | 3 | 0 | ... | 12 | 7 | 0 | 0 | 2.62 | 3.20 | 2.75 | 0.400000 | 0.272727 | Sat |
| 7 | 8 | h | 0.871590 | 1.224600 | 0.871590 | 1.224600 | 5 | 5 | 0 | 0 | ... | 14 | 2 | 1 | 1 | 3.00 | 3.25 | 2.37 | 0.333333 | 0.300000 | Sat |
5 rows × 45 columns
# Drop the row counter and the string-valued columns so that only
# numeric features remain for scaling.
home = home.drop(
    labels=['Unnamed: 0', 'h_a', 'result', 'date', 'teamId', 'Referee.x', 'matchDay'],
    axis='columns',
)
home.head()
| xG | xGA | npxG | npxGA | deep | deep_allowed | scored | missed | xpts | wins | ... | AST.x | AF.x | AC.x | AY.x | AR.x | B365H.x | B365D.x | B365A.x | HtrgPerc | AtrgPerc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.234560 | 0.842407 | 2.234560 | 0.842407 | 11 | 5 | 4 | 1 | 2.3863 | 1 | ... | 5 | 9 | 2 | 2 | 0 | 1.14 | 10.00 | 19.00 | 0.466667 | 0.416667 |
| 3 | 1.200300 | 3.183770 | 1.200300 | 2.422640 | 1 | 9 | 0 | 5 | 0.2522 | 0 | ... | 9 | 13 | 1 | 2 | 0 | 12.00 | 6.50 | 1.22 | 0.600000 | 0.642857 |
| 4 | 1.340990 | 1.598640 | 1.340990 | 1.598640 | 4 | 6 | 1 | 1 | 1.0172 | 0 | ... | 3 | 19 | 4 | 1 | 0 | 1.95 | 3.60 | 3.60 | 0.230769 | 0.375000 |
| 6 | 0.909241 | 1.087520 | 0.909241 | 1.087520 | 0 | 9 | 3 | 0 | 1.1422 | 1 | ... | 3 | 12 | 7 | 0 | 0 | 2.62 | 3.20 | 2.75 | 0.400000 | 0.272727 |
| 7 | 0.871590 | 1.224600 | 0.871590 | 1.224600 | 5 | 5 | 0 | 0 | 1.0512 | 0 | ... | 3 | 14 | 2 | 1 | 1 | 3.00 | 3.25 | 2.37 | 0.333333 | 0.300000 |
5 rows × 38 columns
# Min-max scale every remaining column (fit and transform in one step).
scaler = MinMaxScaler()
scaled = scaler.fit_transform(home)
print(scaled)
[[0.32468965 0.17681474 0.36768374 ... 0.71842317 0.46666667 0.41666667] [0.16580485 0.70199763 0.18776005 ... 0.00321802 0.6 0.64285714] [0.18741789 0.3464427 0.21223501 ... 0.09895414 0.23076923 0.375 ] ... [0.29308506 0.08788059 0.33189419 ... 0.1230893 0.64705882 0.33333333] [0.24854551 0.11918188 0.2814569 ... 0.01850362 0.5 0.28571429] [0.3520082 0.13869184 0.26620376 ... 0.27594529 0.46666667 0.25 ]]
PCA(주성분 분석)는 차원 축소 기법 중 하나로, 컬럼(변수)의 개수가 많을 때 데이터의 차원(Dimension)을 낮추는 역할을 한다.
# Configure and fit a 4-component PCA on the scaled features.
pca = PCA(n_components=4).fit(scaled)
# Apply the PCA: project every match onto the four components.
pca_result = pd.DataFrame(
    pca.transform(scaled),
    columns=[f'PCA{i}' for i in range(1, 5)],
)
# Component loadings: one row per original feature, one column per component.
pca_value = pd.DataFrame(pca.components_.T, index=home.columns)
print(pca_value)
0 1 2 3 xG -0.094734 0.015808 -0.118161 0.036680 xGA 0.140346 -0.131545 0.158305 -0.067885 npxG -0.096503 0.017722 -0.126467 0.034830 npxGA 0.137182 -0.132086 0.170624 -0.062214 deep -0.061460 0.001557 -0.153594 0.059538 deep_allowed 0.081760 -0.085197 0.230949 -0.027762 scored -0.118833 -0.030096 0.010956 0.014804 missed 0.106785 -0.057480 -0.019667 0.025020 xpts -0.252673 0.132415 -0.292995 0.089978 wins -0.537585 -0.341442 0.181727 0.001402 draws 0.106972 0.753849 0.172270 -0.127088 loses 0.430613 -0.412407 -0.353998 0.125686 pts -0.501928 -0.090159 0.239151 -0.040961 npxGD -0.123258 0.070268 -0.157451 0.050147 ppda_cal 0.063279 -0.054920 0.163821 -0.052282 allowed_ppda -0.040542 0.017905 -0.152153 0.053959 matchtime 0.002313 -0.038000 -0.066778 -0.101766 tot_points -0.060472 -0.031667 -0.145421 -0.301273 round -0.023017 -0.045469 -0.171484 -0.649686 tot_goal -0.063739 -0.043758 -0.207971 -0.362749 tot_con 0.032944 -0.049566 -0.083156 -0.488086 HS.x -0.090384 0.057853 -0.239817 0.096210 HST.x -0.124876 0.002992 -0.116020 0.031758 HF.x -0.011263 0.022994 0.001764 0.000996 HC.x -0.045859 0.063416 -0.201973 0.088770 HY.x 0.033301 0.044271 0.022442 -0.017600 HR.x 0.004588 -0.051744 0.066248 0.019078 AS.x 0.120768 -0.112266 0.300448 -0.077558 AST.x 0.107344 -0.103698 0.124777 -0.012166 AF.x 0.014959 0.005116 0.038348 -0.030747 AC.x 0.037019 -0.052745 0.165289 -0.082769 AY.x 0.009486 0.048210 -0.025238 -0.016834 AR.x -0.016473 0.053703 0.045608 0.005274 B365H.x 0.083985 -0.079951 0.145808 -0.021150 B365D.x -0.032822 -0.043475 -0.088837 0.033000 B365A.x -0.085006 0.008618 -0.179389 0.058222 HtrgPerc -0.053094 -0.040464 0.069991 -0.026693 AtrgPerc 0.045000 -0.039110 -0.084746 0.046488
# Select the six features that are used for clustering.
home1 = home.loc[:, ['npxG', 'scored', 'deep', 'npxGA', 'deep_allowed', 'missed']]
home1.head()
| npxG | scored | deep | npxGA | deep_allowed | missed | |
|---|---|---|---|---|---|---|
| 0 | 2.234560 | 4 | 11 | 0.842407 | 5 | 1 |
| 3 | 1.200300 | 0 | 1 | 2.422640 | 9 | 5 |
| 4 | 1.340990 | 1 | 4 | 1.598640 | 6 | 1 |
| 6 | 0.909241 | 3 | 0 | 1.087520 | 9 | 0 |
| 7 | 0.871590 | 0 | 5 | 1.224600 | 5 | 0 |
# Min-max scale the selected clustering features (fit and transform in one step).
scaler = MinMaxScaler()
scaled = scaler.fit_transform(home1)
Elbow method: 군집 수에 따른 군집 내 제곱합(SSE)을 그린 scree plot에서 기울기가 급격히 완만해지며 꺾이는 지점을 elbow point라고 하며, 이때의 군집 수를 사용한다.
def elbow(X, max_k=10):
    """Plot the k-means inertia for 1..max_k clusters (elbow plot).

    Parameters
    ----------
    X : array-like
        Samples to cluster; passed unchanged to ``KMeans.fit``.
    max_k : int, default 10
        Largest number of clusters to try. The default reproduces the
        previously hard-coded range of 1 to 10 clusters.

    Returns
    -------
    None
        The function only draws and shows the plot.
    """
    ks = range(1, max_k + 1)
    # Inertia is the within-cluster sum of squares of the fitted model.
    sse = [
        KMeans(n_clusters=k, init='k-means++', random_state=0).fit(X).inertia_
        for k in ks
    ]
    plt.plot(ks, sse, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Within Group Sum of Squares')
    plt.show()
# Draw the elbow plot used to choose the number of clusters.
elbow(scaled)
# Fit the final 4-cluster model. The seed is fixed so that the cluster
# indices are reproducible between runs; without it the index assigned to
# each cluster can change on every execution.
# NOTE(review): any code that interprets clusters by index (e.g. fixed
# legend labels) should be re-checked against the centres printed below.
km = KMeans(n_clusters=4, random_state=0).fit(scaled)
print(km.cluster_centers_)
[[0.16742899 0.16111111 0.16953405 0.40391392 0.34656085 0.17037037] [0.19595621 0.1 0.21039427 0.1869128 0.18095238 0.08888889] [0.16943311 0.13333333 0.17204301 0.64665691 0.63492063 0.32962963] [0.39884896 0.34294872 0.35525227 0.1769826 0.20695971 0.07264957]]
# Cluster label predicted for every scaled home-match row.
predict = pd.DataFrame(km.predict(scaled))
predict.columns = ['predict']
scaled = pd.DataFrame(data=scaled)
# Put the scaled features and their cluster label side by side.
result = pd.concat([scaled, predict], axis=1)
result.columns = ['npxG','scored','deep','npxGA','deep_allowed','missed','Cluster']
# `result` carries a fresh 0..n-1 index, while `data` is indexed over all
# rows (home and away). Assigning data['teamId'] directly aligns on index
# labels and attaches the wrong team to most rows. Select the team ids of
# the home rows by their original labels and assign them positionally.
result['teamId'] = data.loc[home1.index, 'teamId'].to_numpy()
result
| npxG | scored | deep | npxGA | deep_allowed | missed | Cluster | teamId | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.367684 | 0.500 | 0.354839 | 0.198122 | 0.238095 | 0.111111 | 3 | Liverpool |
| 1 | 0.187760 | 0.000 | 0.032258 | 0.595293 | 0.428571 | 0.555556 | 2 | Norwich |
| 2 | 0.212235 | 0.125 | 0.129032 | 0.388191 | 0.285714 | 0.111111 | 0 | Man City |
| 3 | 0.137126 | 0.375 | 0.000000 | 0.259728 | 0.428571 | 0.000000 | 0 | West Ham |
| 4 | 0.130576 | 0.000 | 0.161290 | 0.294181 | 0.238095 | 0.000000 | 1 | Bournemouth |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 283 | 0.135209 | 0.000 | 0.064516 | 0.064026 | 0.190476 | 0.000000 | 1 | Man City |
| 284 | 0.558776 | 0.125 | 0.193548 | 0.080775 | 0.190476 | 0.111111 | 3 | Aston Villa |
| 285 | 0.331894 | 0.500 | 0.290323 | 0.098471 | 0.142857 | 0.000000 | 3 | Chelsea |
| 286 | 0.281457 | 0.250 | 0.096774 | 0.133544 | 0.095238 | 0.000000 | 1 | Leicester |
| 287 | 0.266204 | 0.500 | 0.258065 | 0.155405 | 0.095238 | 0.000000 | 3 | Man Utd |
288 rows × 8 columns
# Tabulate the fitted cluster centres with the feature names as columns.
cluster_result = pd.DataFrame(
    km.cluster_centers_,
    columns=['npxG', 'scored', 'deep', 'npxGA', 'deep_allowed', 'missed'],
)
cluster_result.head()
| npxG | scored | deep | npxGA | deep_allowed | missed | |
|---|---|---|---|---|---|---|
| 0 | 0.167429 | 0.161111 | 0.169534 | 0.403914 | 0.346561 | 0.170370 |
| 1 | 0.195956 | 0.100000 | 0.210394 | 0.186913 | 0.180952 | 0.088889 |
| 2 | 0.169433 | 0.133333 | 0.172043 | 0.646657 | 0.634921 | 0.329630 |
| 3 | 0.398849 | 0.342949 | 0.355252 | 0.176983 | 0.206960 | 0.072650 |
# Extract Liverpool's rows and average them into a single profile.
Liv = result[result['teamId']=='Liverpool']
# numeric_only=True skips the string 'teamId' column; without it the mean
# raises TypeError on pandas >= 2.0.
Liv = Liv.mean(numeric_only=True)
# Liv[:-1] drops the trailing 'Cluster' mean, leaving the six features.
Liv = pd.DataFrame(data=[Liv[:-1]], columns=['npxG','scored','deep','npxGA','deep_allowed','missed'])
Liv.head()
| npxG | scored | deep | npxGA | deep_allowed | missed | |
|---|---|---|---|---|---|---|
| 0 | 0.27114 | 0.276786 | 0.251152 | 0.272995 | 0.272109 | 0.119048 |
# Extract Man City's rows and average them into a single profile.
Mc = result[result['teamId']=='Man City']
# numeric_only=True skips the string 'teamId' column; without it the mean
# raises TypeError on pandas >= 2.0.
Mc = Mc.mean(numeric_only=True)
# Mc[:-1] drops the trailing 'Cluster' mean, leaving the six features.
Mc = pd.DataFrame(data=[Mc[:-1]], columns=['npxG','scored','deep','npxGA','deep_allowed','missed'])
Mc.head()
| npxG | scored | deep | npxGA | deep_allowed | missed | |
|---|---|---|---|---|---|---|
| 0 | 0.211718 | 0.183333 | 0.23871 | 0.293671 | 0.304762 | 0.133333 |
# Extract Norwich's rows and average them into a single profile.
Nor = result[result['teamId']=='Norwich']
# numeric_only=True skips the string 'teamId' column; without it the mean
# raises TypeError on pandas >= 2.0.
Nor = Nor.mean(numeric_only=True)
# Nor[:-1] drops the trailing 'Cluster' mean, leaving the six features.
Nor = pd.DataFrame(data=[Nor[:-1]], columns=['npxG','scored','deep','npxGA','deep_allowed','missed'])
Nor.head()
| npxG | scored | deep | npxGA | deep_allowed | missed | |
|---|---|---|---|---|---|---|
| 0 | 0.240372 | 0.178571 | 0.193548 | 0.317782 | 0.251701 | 0.134921 |
# Extract Bournemouth's rows and average them into a single profile.
Bou = result[result['teamId']=='Bournemouth']
# numeric_only=True skips the string 'teamId' column; without it the mean
# raises TypeError on pandas >= 2.0.
Bou = Bou.mean(numeric_only=True)
# Bou[:-1] drops the trailing 'Cluster' mean, leaving the six features.
Bou = pd.DataFrame(data=[Bou[:-1]], columns=['npxG','scored','deep','npxGA','deep_allowed','missed'])
Bou.head()
| npxG | scored | deep | npxGA | deep_allowed | missed | |
|---|---|---|---|---|---|---|
| 0 | 0.330171 | 0.2 | 0.225806 | 0.218424 | 0.203175 | 0.081481 |
def Visualization(cluster, team, team_name):
    """Draw a radar chart of four cluster centres with one team overlaid.

    Parameters
    ----------
    cluster : DataFrame
        Its columns become the radar axes; rows 0-3 are drawn as the
        four cluster traces.
    team : DataFrame
        Its first row is drawn as the team trace.
    team_name : str
        Legend label of the team trace.
    """
    axes = cluster.columns
    centers = cluster.values
    # Legend labels by cluster row index (lower-mid, upper-mid, bottom, top).
    # They are tied to the row order of `cluster`.
    tier_labels = ['중하위권', '중상위권', '하위권', '상위권']

    fig = go.Figure()
    for row, label in enumerate(tier_labels):
        fig.add_trace(
            go.Scatterpolar(r=centers[row], theta=axes, fill='toself', name=label)
        )
    # The team's own profile is drawn last, on top of the cluster traces.
    fig.add_trace(
        go.Scatterpolar(r=team.values[0], theta=axes, fill='toself', name=team_name)
    )

    # Fixed radial range so that charts of different teams are comparable.
    radial_axis = dict(visible=True, range=[0, 0.7])
    fig.update_layout(polar=dict(radialaxis=radial_axis))
    fig.show()
# Compare each team's average profile with the cluster centres.
Visualization(cluster_result, Liv, 'Liverpool')
Visualization(cluster_result, Mc, 'Man City')
# Legend label corrected from the misspelling 'Norwitch'.
Visualization(cluster_result, Nor, 'Norwich')
Visualization(cluster_result, Bou, 'Bournemouth')
참고 자료 : https://western-sky.tistory.com/42?category=847897
데이터 : https://www.kaggle.com/datasets/idoyo92/epl-stats-20192020